Internet Info 1994 March

home *** CD-ROM | disk | FTP | other *** search

/ Internet Info 1994 March / Internet Info CD-ROM (Walnut Creek) (March 1994).iso / networking / info-service / wais / ir-book-sources / thesauri / select.c < prev

Wrap

C/C++ Source or Header | 1993-04-08 | 39.7 KB | 1,086 lines

/* PROGRAM NAME: select.c PURPOSE: 1) Compute Discrimination Value of terms, 2) Compute Poisson Distributions for terms, 3) Parition terms by their total within document frequencies, 4) Compute cohesion between pairs of terms, 5) Compute Dice's coefficient of similarity between two terms. INPUT FILES REQUIRED: 1) a direct file, sequences of: document# term weight (multiple entries for any document should be grouped together ) 2) an inverted file, sequences of term document# weight (multiple entries for any term should be grouped together) NOTES: Filters such as stop lists and stemmers should be used before before running this program. PARAMETERS TO BE SET BY USER: 1) MAXWORD - maximum size of a term 2) MAXWDF - maximum value expected for the within document frequency for a term in the collecgtion. 3) LOW_THRESHOLD - threshold for LOW and MID frequency ranges 4) HIGH_THRESHOLD - threshold for MID and HIGH frequency ranges COMMAND LINE: select direct_file inverted_file output_file ****************************************************************************/ #include <stdio.h> #include <string.h> #include <math.h> #define MAXWORD 20 /* maximum size of a term */ #define MAXWDF 30 /* maximum WDF for a word in a database */ #define LOW_THRESHOLD 2.0 #define HIGH_THRESHOLD 4.0 struct termlist { /* sequences of term and weight pairs */ char term[MAXWORD]; /* term */ float weight; /* term weight in document */ struct termlist *nextterm;/* ptr. to next termlist record */ } termlistfile; struct doclist { /* sequences of document # and weight pairs*/ int doc; /* document number */ float weight; /* term weight in document */ struct doclist *nextdoc; /* ptr. to next doclist record */ } doclistfile; struct direct { /* direct file: document to list of terms */ int docnum; /* document # */ struct termlist *terms; /* sequences of term and weight pairs */ struct direct *next; /* ptr. to next direct record */ } directfile; struct invert { /* inverted file: term to list of documnts */ char term[MAXWORD]; /* term */ struct doclist *doc; /* sequences of document # and weight pairs*/ struct invert *next; /* ptr. to next invert record */ } invfile; static struct invert *startinv; /* ptr. to first record in inverted file */ static struct invert *lastinv; /* ptr. to last record in inverted file */ static struct doclist *lastdoc; /* ptr. to last document in doclist */ static struct direct *startdir; /* ptr. to first record in direct file */ static struct direct *lastdir; /* ptr. to last record in direct file */ static struct termlist *lastterm; /* ptr. to last term in termlist */ static struct direct *start_centroid; /* ptr. to centroid record */ static FILE *input; /* direct file */ static FILE *input1; /* inverted file */ static FILE *output; /* file to hold all output */ static int currentdoc; /* tracks current document in direct file */ static char currentterm[MAXWORD]; /* tracks current term in inverted file */ static int Number_of_docs; /* total # of documents which is computed */ static float av_doc_similarity(), /* compute average doc. similarity in dbse. */ factorial(), /* compute factorial of a number */ cosine(), /* compute cosine between two terms */ dice(), /* compute dice beteen two terms */ total_wdf(), /* compute total frequency of term in dbse. */ cohesion(); /* compute cohesion between two terms */ static void initialize(), /* initialize files and global variables */ read_directfile(), /* read in the direct file */ add_direct(), /* called within read_directfile() */ pr_direct(), /* print the direct file */ read_invfile(), /* read in the inverted file */ add_invert(), /* called within read_invfile() */ pr_invert(), /* print the inverted file */ centroid(), /* compute the document centroid for dbse. */ pr_centroid(), /* print the document centroid */ get_Poisson_dist(), /* compute Poisson distributions for terms */ Partition_terms(), /* partition terms by frequency */ dv_all(), /* compute discrimination value of terms */ get_doc_data(), /* get basic info. about documents */ get_term_data(); /* get basic info. about terms */ static struct direct *get_mem_direct(); /* these 4 get_mem functions are */ static struct invert *get_mem_invert(); /* used to obtain memory for a */ static struct doclist *get_mem_doclist(); /* record. The record type is */ static struct termlist *get_mem_termlist(); /* obvious from the name */ struct invert *find_term(); /* searches for term in inverted file*/ /* and returns its address */ int main(argc,argv) int argc; char *argv[]; { char ch, word1[MAXWORD], word2[MAXWORD]; if (argc!=4) { (void) printf("There is an error in the command line\n"); (void) printf("Correct usage is\n"); (void) printf("select direct_file inverted_file output_file\n"); exit(1); } initialize(argv); (void) fprintf(output,"\nREADING IN DIRECT FILE\n"); read_directfile(); (void) fprintf(output,"\nPRINTING DIRECT FILE\n\n"); pr_direct(); (void) fprintf(output,"\nNUMBER OF DOCUMENTS IS: %d\n\n",Number_of_docs); (void) fprintf(output,"\nREADING IN INVERTED FILE\n"); read_invfile(); (void) fprintf(output,"\nPRINTING INVERTED FILE\n"); pr_invert(); (void) printf("\nPlease make a selection\n\n"); (void) printf("To compute DV for all terms enter 1\n"); (void) printf("To compute Poisson distributions enter 2\n"); (void) printf("To partition terms by frequency enter 3\n"); (void) printf("To compute cohesion between two terms (for phrase construction) enter 4\n"); (void) printf("To compute Dice's coefficient between two terms enter 5\n"); (void) printf("To quit enter 6\n\n"); (void) printf("Enter your choice: "); ch = getchar(); switch(ch) { case '1': centroid(); (void) fprintf(output,"\nCENTROID\n\n"); pr_centroid(); (void) fprintf(output,"\nDISCRIMINATION VALUES FOR ALL TERMS\n\n"); dv_all(); break; case '2': (void) fprintf(output,"\nACTUAL AND POISSON DISTRIBUTIONS OF WITHIN DOCUMENT FREQUENCIES FOR ALL TERMS\n\n"); (void) fprintf(output,"WDF = Within Document Frequency & #docs = Number of documents\n\n"); get_Poisson_dist(); break; case '3': (void) printf("Make sure that the threshold parameters are set correctly in the programm\n"); (void) fprintf(output,"\nPARTITIONING THE TERMS INTO LOW, MEDIUM, HIGH FREQUENCY CLASSES\n\n"); Partition_terms(); break; case '4': (void) printf("enter first word: "); (void) scanf("%s",word1); if (find_term(word1) == NULL) { printf("sorry, %s is not in the inverted file\n",word1); break; } (void) printf("enter second word: "); (void) scanf("%s",word2); if (find_term(word2) == NULL) { printf("sorry, %s is not in the inverted file\n",word2); break; } (void) fprintf(output,"Cohesion between %s and %s is %f\n",word1,word2,cohesion(word1,word2)); break; case '5': (void) printf("enter first word: "); (void) scanf("%s",word1); if (find_term(word1) == NULL) { printf("sorry, %s is not in the inverted file\n",word1); break; } (void) printf("enter second word: "); (void) scanf("%s",word2); if (find_term(word2) == NULL) { printf("sorry, %s is not in the inverted file\n",word2); break; } (void) fprintf(output,"Dice's coefficient between %s and %s is %f\n",word1,word2,dice(word1,word2)); break; case '6': exit(0); default: (void) printf("no selection made\n"); } (void) fclose(input1); (void) fclose(input); (void) fclose(output); return(0); } /**************************************************************************** initialize(argv) Returns: void Purpose: Open all required files and initialize global variables **/ static void initialize(argv) char *argv[]; /* in: holds the three parameters input at the command line */ { if (( input = fopen(argv[1],"r")) == NULL ) { (void) printf("couldn't open file %s\n",argv[1]); exit(1); /* input direct file */ } if (( input1 = fopen(argv[2],"r")) == NULL ) { (void) printf("couldn't open file %s\n",argv[2]); exit(1); /* input inverted file */ } if (( output = fopen(argv[3],"w")) == NULL) { (void) printf("couldn't open file %s for output\n",argv[3]); exit(1); /* output file */ } /* set initial values of global variables */ startinv = NULL; lastinv = NULL; lastdoc = NULL; startdir = NULL; lastdir = NULL; lastterm = NULL; start_centroid = NULL; currentdoc = 0; currentterm[0] = '\0'; Number_of_docs = 0; } /****************************************************************************/ /* read_directfile() Returns: void Purpose: Read in the direct file entries from the 1st input file **/ static void read_directfile() { int docid; /* holds the current document number */ char temp[MAXWORD]; /* holds the current term */ float weight; /* holds the current term weight */ struct termlist *p; /* structure to store the term-weight pair */ (void) fscanf(input,"%d%s%f",&docid,temp,&weight); /* read the next line */ while (docid > 0) /* while its found a legitimate document number */ { if (docid == currentdoc) { /* if this document number has previously been entered in direct file */ /* then only need to attach a termlist record to the same entry */ p = get_mem_termlist(); /* get memory for a termlist record */ (void) strncpy(p->term,temp,MAXWORD); /* copy the new word over */ p->weight = weight; /* assign the new weight over */ p->nextterm = NULL; if (lastterm) lastterm->nextterm = p; /* connect p to the termlist chain for this document */ lastterm = p; /* set this global variable */ } else { /* else docid represents a new document */ Number_of_docs = Number_of_docs +1; /* increment global variable */ add_direct(docid,temp,weight); /* starts a brand new entry in */ /* the direct file */ } docid = 0; (void) fscanf(input,"%d%s%f",&docid,temp,&weight); } } /*************************************************************************** add_direct(docid,temp,weight) Returns: void Purpose: Start a new entry in the direct file. It is called in the read_directfile function when a new document number is read from the input file. **/ static void add_direct(docid,temp,weight) int docid; /* in: new document number */ char temp[MAXWORD]; /* in: index term */ float weight; /* in: index term weight */ { struct direct *p; /* structure p will be attached to direct file */ p = get_mem_direct(); /* get memory for p */ p->docnum = docid; /* assign the document number */ p->terms = get_mem_termlist(); /* get memory for termlist structure */ (void) strncpy(p->terms->term,temp,MAXWORD); /* assign index term to it */ p->terms->weight = weight; /* assign term weight to it */ p->terms->nextterm = NULL; /* current end of termlist */ p->next = NULL; /* current end of direct file */ if (startdir == NULL) startdir = p; /* if this is the very first document then global variable pointing to start of direct file should be updated */ if (lastdir) lastdir->next = p; /* update pointer to last direct file rec. */ lastdir = p; lastterm = p->terms; /* update pointer to last term */ currentdoc = docid; /* update the global variable currentdoc to the */ /* document number just added */ } /*************************************************************************** pr_direct() Returns: void Purpose: Print the direct file. It prints sequences of document# term weight. **/ static void pr_direct() { struct direct *dir_addr; /* tracks address of current direct file record */ struct termlist *term_addr; /* tracks address of current termlist record */ dir_addr = startdir; /* start with beginning of direct file */ while (dir_addr) { /* check for legitimate direct file record */ (void) fprintf(output,"DOCUMENT NUMBER: %d \n",dir_addr->docnum); (void) fprintf(output," TERM TERM WEIGHT\n"); term_addr = dir_addr->terms; /* get addr. of first term */ while (term_addr) { /* loop through all the terms */ (void) fprintf(output," %-30s ",term_addr->term); (void) fprintf(output,"%-10.3f\n",term_addr->weight); term_addr = term_addr->nextterm; /* go to next term for the doc. */ } (void) fprintf(output,"\n"); dir_addr = dir_addr->next; /* go to next direct file record */ } } /*************************************************************************** read_invfile() Returns: void Purpose: Read in the inverted file entries from 2nd input file **/ static void read_invfile() { int docid; /* holds currenct document number */ char temp[MAXWORD]; /* holds current term */ float weight; /* holds current term weight */ struct doclist *p; /* structure to store doc#-weight pair */ (void) fscanf(input1,"%s%d%f",temp,&docid,&weight); /* read next line */ while (strlen(temp) > 0) /* while its found a legitimate term */ { if (!strncmp(currentterm,temp,MAXWORD)) { /* if this term has previously been entered in inverted file */ /* then only need to attach a doclist record to same term entry */ p = get_mem_doclist(); /* get memory for doclist record */ p->doc = docid; /* assign document number */ p->weight = weight; /* assign weight */ p->nextdoc = NULL; if (lastdoc) lastdoc->nextdoc = p; /* connect p to the doclist chain for this term */ lastdoc = p; /* set this global variable */ } else add_invert(docid,temp,weight); /* else term is a brand new term & need to make a new inverted file entry */ temp[0] = '\0'; (void) fscanf(input1,"%s%d%f",temp,&docid,&weight); /* read next line */ } } /*************************************************************************** add_invert(docid,temp,weight); Returns: void Purpose: Start a new entry in the inverted file. It is called in the read_invfile function when a new term is read from the input file **/ static void add_invert(docid,temp,weight) int docid; /* in: document number */ char temp[MAXWORD]; /* in: new index term */ float weight; /* in: index term weight */ { struct invert *p; /* structure p will be attached to inverted file */ p = get_mem_invert(); /* get memory for p */ (void) strncpy(p->term,temp,MAXWORD); /* copy over the term */ p->doc = get_mem_doclist(); /* start a doclist structure */ p->doc->doc = docid; /* assign document number */ p->doc->weight = weight; /* assign term weight */ p->doc->nextdoc = NULL; p->next = NULL; if (startinv == NULL) startinv = p; /* if this is the first entry in inverted file, then update global var. */ if (lastinv) lastinv->next = p; /* update ptr. to last inverted file record */ lastinv = p; lastdoc = p->doc; /* update ptr. to last document */ (void) strncpy(currentterm,temp,MAXWORD); /* update global var. currentterm to the */ /* new term just entered */ } /*************************************************************************** pr_invert() Returns: void Purpose: Print the inverted file. It prints sequences of term document# weight. **/ static void pr_invert() { struct invert *inv_addr; /* tracks address of current inverted file record */ struct doclist *doc_addr; /* tracks address of current doclist record */ inv_addr = startinv; /* start with beginning of inverted file */ while (inv_addr) { /* check for legitimate inverted file record */ (void) fprintf(output,"TERM: %s\n",inv_addr->term); (void) fprintf(output," DOCUMENT NUMBER TERM WEIGHT\n"); doc_addr = inv_addr->doc; /* get addr. of first document */ while (doc_addr) { /*loop through all the associated doc.#s and weights*/ (void) fprintf(output," %-30d ",doc_addr->doc); (void) fprintf(output,"%-10.5f\n",doc_addr->weight); doc_addr = doc_addr->nextdoc; /* get addr. of next document */ } (void) fprintf(output,"\n"); inv_addr = inv_addr->next; /* go to next inverted file record */ } } /*************************************************************************** centroid() Returns: void Purpose: Compute and return the centroid for the documents of the database. Notes: Centroid is stored as a direct file record. Document number given to it is Number_of_docs + 1. The centroid is computed by determining for each index term in the inverted file, its average weight in the database. These average weights are then stored in the direct file entry for the centroid. (Note that these average weights are not used anywhere, but could be used in computing DV for terms). **/ static void centroid() { struct invert *inv_addr; /* tracks address of current inverted file record */ struct doclist *doc_addr; /* tracks address of current doclist record */ float total_weight, /* tracks total weight for each term */ av_term_weight; /* holds average term weight for each term */ struct termlist *q, /* structure used to create centroid entry in the direct file */ *lastterm; /* tracks the last term in the centroid */ start_centroid = get_mem_direct(); /* centroid stored as direct file record */ start_centroid->docnum = Number_of_docs +1; /* assign its pseudo doc.# */ start_centroid->next = NULL; /* end of direct file chain */ lastterm = NULL; inv_addr = startinv; /* begin at top of inverted file */ while (inv_addr) { /* while there is a legitimate inv. file record... */ doc_addr = inv_addr->doc; /* get address of first document */ total_weight = 0.0; /* start with a 0 total weight for this term */ while (doc_addr) { /* if this is a legitimate doc. addr. */ total_weight = total_weight + doc_addr->weight; /* update total weight for term */ doc_addr = doc_addr->nextdoc; /* loop through all docs. for the term */ } av_term_weight = total_weight/Number_of_docs; /* calculating average term wt. */ q = get_mem_termlist(); (void) strncpy(q->term,inv_addr->term,MAXWORD); q->weight = av_term_weight; q->nextterm = NULL; if (lastterm == NULL) start_centroid->terms = q; /* if this is the first term entry for the centroid */ else lastterm->nextterm = q; /* else connect this term to the centroid's termlist chain */ lastterm = q; inv_addr = inv_addr->next; /* go on to the next inverted file entry */ } } /*************************************************************************** pr_centroid() Returns: void Purpose: Print the centroid from the direct file **/ static void pr_centroid() { struct termlist *term_addr; /* tracks address of current termlist record */ /* note the centroid is given a document number = Number_of_docs + 1 */ /* therefore it may be treated as a special kind of document vector */ if (start_centroid) { /* if there is a centroid */ (void) fprintf(output,"---------------------------------------\n"); (void) fprintf(output,"TERM WEIGHT \n"); (void) fprintf(output,"---------------------------------------\n"); term_addr = start_centroid->terms; /* get first term address */ while (term_addr) { /* printing out term and weight pairs */ (void) fprintf(output,"%-30s ",term_addr->term); (void) fprintf(output,"%-10.5f\n",term_addr->weight); term_addr = term_addr->nextterm; /* loop through all terms */ } (void) fprintf(output,"\n"); } } /*************************************************************************** get_Poisson_dist() Returns: void Purpose: Get the Poisson distribution data for any term Notes: This function has two parts: PART I: Determine the actual within doc. freq. distribution PART II: Determine the distribution anticipated under the single Poisson model. It is assumed that the within document frequency of a term is stored as the term weight in the inverted file. **/ static void get_Poisson_dist() { struct invert *inv_addr; /* tracks address of current inverted file record */ struct doclist *doc_ad; /* tracks address of current doclist record */ float dist[MAXWDF][2]; /* store for each term */ /* column 1 = within document frequency (wdf) */ /* column 2 = document frequency */ int i, /* counter to add information to the dist array */ j, /* counter to loop through dist array */ found, /* flag used to match wdf in dist array */ numdocs, /* counter to track # of docs. with the same wdf */ docs_with_term; /* tracks the number of documents having the term */ float first, /* these five local variables are */ second, /* used to determine expected distribution */ result, exponent, lambda; /* single Poisson parameter */ inv_addr = startinv; /* start at the beginning of the inverted file */ /* PART I: For each term determine the number of documents in the collection that have a particular wdf */ while (inv_addr) { /* check for legitimate inv. file record */ docs_with_term = 0; doc_ad = inv_addr->doc; /* get the first doc. address */ i = 0; /* used to check if this is the very first entry in dist */ while (doc_ad) { if (i == 0) { /* if first entry in dist */ dist[i][0] = doc_ad->weight; dist[i][1] = 1; /* assign wdf and doc. frequency = 1 to first row in dist */ i++; docs_with_term++; } else { /* dist already has other entries, hence look for any previous entries for the same wdf value */ found = 0; for (j=0;j<i;j++) { /* loop through dist */ if (dist[j][0] == doc_ad->weight) { /* if found the same wdf */ dist[j][1] = dist[j][1] + 1; /* add 1 to the doc. frequency */ found = 1; docs_with_term++; } } /* ending for */ if (found == 0) { /* if not found the same wdf in dist */ /* start new row in dist */ dist[i][0] = doc_ad->weight; dist[i][1] = 1; i++; docs_with_term++; } } /* ending else */ doc_ad = doc_ad->nextdoc;/* loop through other documents for same term */ } /* ending while */ /* ending if */ /* now print out actual distribution information for this term */ (void) fprintf(output,"\nTerm = %s\n",inv_addr->term); (void) fprintf(output,"\nActual Distribution: "); (void) fprintf(output," WDF #docs\n"); (void) fprintf(output," 0 %d\n",Number_of_docs-docs_with_term); for (j=0;j<i;j++) (void) fprintf(output," %-16.0f %-6.0f\n",dist[j][0],dist[j][1]); /* PART II: */ /* computing lambda - the only single Poisson parameter */ (void) fprintf(output,"\nExpected Distribution: "); (void) fprintf(output," WDF #docs\n"); /* call the function total_wdf to compute the total frequency of the term */ lambda = (total_wdf(inv_addr->term))/Number_of_docs; first = exp(-lambda); numdocs = -1; j = -1; /* computing document frequency for each within document frequency value */ while (numdocs != 0) { j = j + 1; exponent = j; /* type conversion necessary for pow function */ if (j == 0) result = first * Number_of_docs; else { second = pow(lambda,exponent); result = (((first * second)/factorial(j)) * Number_of_docs); } if ((result - floor(result)) < 0.5) numdocs = floor(result); else numdocs = ceil(result); (void) fprintf (output," %-16d %-6d\n",j,numdocs); } inv_addr = inv_addr->next; /* continue with the next inverted file term */ } /* end while */ } /*************************************************************************** factorial(n) Returns: float Purpose: Return the factorial of a number. Used in get_poisson_dist **/ static float factorial(n) int n; /* in: compute factorial for this parameter */ { float answer; /* holds the result */ if (n==1) return(1.0); answer = factorial(n-1)*n; return(answer); } /*************************************************************************** total_wdf(term) Returns: float Purpose: Compute total frequency in the database for a specified term using the inverted file. Notes: It is assumed that the appropriate term frequency is stored as the term weight in the inverted file. This routine can also be used to filter out the low and high frequency terms. The resulting mid frequency terms can be used as input to the program which generates hierarchies. **/ static float total_wdf(term) char term[MAXWORD]; /* in: term for which total frequency is to be found */ { struct invert *inv_addr; /* tracks current inverted file record */ struct doclist *doc_addr; /* tracks current doclist record */ float total; /* tracks the total frequency */ total = 0.0; /* initial value */ /* function find_term will find out where the term is in the inverted file */ inv_addr = find_term(term); if (inv_addr) { /* if this term was found in the inverted file */ doc_addr = inv_addr->doc; /* get first associated document address */ while (doc_addr) { /* update the total frequency */ total = total + doc_addr->weight; doc_addr = doc_addr->nextdoc; /* loop through other associated docs. */ } } return(total); } /*************************************************************************** Partition_terms() Returns: void Purpose: Assign each term in the inverted file to one class: HIGH, MEDIUM or LOW frequency depending upon its total frequency in the collection. This function utilizes two parameters defined at the top of the program: LOW_THRESHOLD and HIGH_THRESHOLD, which should be set by the user. **/ static void Partition_terms() { struct invert *inv_addr; /* tracks address of current inverted file record */ float total; /* holds total frequency of each term */ inv_addr = startinv; /* start at the beginning of the inverted file */ (void) fprintf(output,"\nTerm - Total Frequency - Frequency Class\n\n"); while (inv_addr) { /* if a legitimate address */ /* compute total frequency for term in collection */ total = total_wdf(inv_addr->term); (void) fprintf(output,"\n%s - %f -",inv_addr->term,total); if (total < LOW_THRESHOLD) (void) fprintf(output," LOW\n"); else if (total > HIGH_THRESHOLD) (void) fprintf(output," HIGH\n"); else (void) fprintf(output," MEDIUM\n"); inv_addr = inv_addr->next; /* continue with next inverted file entry */ } } /*************************************************************************** cohesion(term1,term2) Returns: float Purpose: Compute the cohesion between two terms **/ static float cohesion(term1,term2) char term1[MAXWORD], /* in: the two terms which are being */ term2[MAXWORD]; /* in: compared to determine cohesion */ { float l1, /* holds # of documents associated with term1 */ l2, /* holds # of documents associated with term2 */ common; /* holds # of documents in common */ get_term_data(term1,term2,&l1,&l2,&common); return(common/(sqrt(l1 * l2))); } /*************************************************************************** dv_all() Returns: void Purpose: Compute Discrimination Value (DV) for all terms in the database Notes: Similarity between two documents as calculated here is a function of the number of terms in common and the number of terms in each. Term weights are not involved **/ static void dv_all() { struct invert *inv_addr; /* tracks address of current inv. file record */ float DV, /* holds computed DV */ baseline; /* holds baseline similarity */ /* first compute baseline similarity */ baseline = av_doc_similarity("-"); /* the dummy term '-' is used for this */ (void) fprintf(output,"-----------------------------------------\n"); (void) fprintf(output,"TERM DV\n"); (void) fprintf(output,"-----------------------------------------\n"); inv_addr = startinv; /* begin at top of inverted file */ while (inv_addr) { /* if legitimate inverted file record */ DV = av_doc_similarity(inv_addr->term) - baseline; (void) fprintf(output,"%-30s %-10.5f \n",inv_addr->term,DV); inv_addr = inv_addr->next; /* go to next inverted file record */ } } /*************************************************************************** av_doc_similarity(term) Returns: float Purpose: Compute average similarity between each document and the centroid of the database. The word specified in term is ignored during computations. **/ static float av_doc_similarity(term) char term[MAXWORD]; /* in: term is ignored during computations */ { struct direct *dir_addr; /* tracks current direct file record */ float dl1, /* holds # of terms in document */ dl2, /* holds # of terms in centroid */ common, /* holds # of terms in common between them */ total_sim; /* holds total similarity between them */ total_sim = 0.0; dir_addr = startdir; /* begin with first direct file record */ while (dir_addr) { /* get_doc_data returns #of terms in each document and #of terms in common */ get_doc_data(dir_addr,start_centroid,&dl1,&dl2,&common,term); total_sim = total_sim + cosine(dl1,dl2,common); dir_addr = dir_addr->next; /* go to next direct file record */ } return(total_sim/Number_of_docs); } /*************************************************************************** dice(term1,term2) Returns: float Purpose: Returns Dice's coefficient of similarity between any two documents **/ static float dice(term1,term2) char term1[MAXWORD], /* in: the two terms that are being compared */ term2[MAXWORD]; /* in: */ { float l1, l2, common; get_term_data(term1,term2,&l1,&l2,&common); if (l1 == 0 || l2 == 0) return(0.0); return(common/(l1 + l2)); } /*************************************************************************** cosine(l1,l2,common) Returns: float Purpose: Returns cosine similarity between two documents **/ static float cosine(l1,l2,common) float l1, /* in: # of terms associated with document 1 */ l2, /* in: # of terms associated with document 2 */ common; /* in: # of terms in common between them */ { float temp; if (l1 == 0 || l2 == 0) return(0.0); temp = sqrt(l1 * l2); return(common/temp); } /*************************************************************************** get_doc_data(p,q,l1,l2,common,index) Returns: void Purpose: Given two document numbers, it determines the number of of index terms in each and in common. It will exclude the index term (specified as the last parameter) from consideration. Used in av_doc_similarity for DV calculations. **/ static void get_doc_data(p,q,l1,l2,common,index) char index[MAXWORD]; /* in: term to be excluded from computations */ struct direct *p, *q; /* in: addresses of two documents numbers */ float *l1, *l2; /* out: number of terms in each document */ float *common; /* out: number of terms in common */ { struct termlist *term_addr1, /* holds address of first docs. termlist */ *term_addr2; /* holds address of second docs. termlist */ int count1, /* number of terms in first doc. */ count2, /* number of terms in second doc. */ com; /* number of terms in common */ term_addr1 = p->terms; count1 = 0; count2 = 0; com = 0; /* first find out number of terms in document 1 & # of common terms */ while (term_addr1) { if (strncmp(term_addr1->term,index,MAXWORD)) count1 = count1 + 1; /* if its not the term to exclude */ term_addr2 = q->terms; while (term_addr2) { if (!strncmp(term_addr1->term,term_addr2->term,MAXWORD)) { /* if the two terms are the same */ if (strncmp(term_addr1->term,index,MAXWORD)) com = com + 1; /* if they do not match the term to exclude */ break; } term_addr2 = term_addr2->nextterm; } term_addr1 = term_addr1->nextterm; } /* now find out number of terms in document 2 */ term_addr2 = q->terms; while (term_addr2) { if (strncmp(term_addr2->term,index,MAXWORD)) count2 = count2 + 1; term_addr2 = term_addr2->nextterm; } *l1 = count1; *l2 = count2; *common = com; } /*************************************************************************** get_term_data(term1,term2,l1,l2,common) Returns: void Purpose: Get info regarding number of documents in common between any two terms. **/ static void get_term_data(term1,term2,l1,l2,common) char term1[MAXWORD], /* in: term 1 to be compared with */ term2[MAXWORD]; /* in: term 2 */ float *l1, /* out: # of documents associated with 1st term */ *l2, /* out: # of documents associated with 2nd term */ *common; /* out: # of documents in common between them */ { struct invert *p, /* holds address of first term in inverted file */ *q; /* holds address of second term in inv. file */ struct doclist *doc_ad1,/* tracks doclist for first term */ *doc_ad2;/* tracks doclist for second term */ int count1, /* holds # of documents associated with 1st term */ count2, /* holds # of documents associated with 2nd term */ com; /* holds # of documents common between them */ /* find addresses of both terms in the inverted file */ p = find_term(term1); q = find_term(term2); doc_ad1 = p->doc; /* obtain 1st terms doclist address */ count1 = 0; count2 = 0; com = 0; /* first get # of documents indexed by term 1 & # of docs. in common */ while (doc_ad1) { count1 = count1 +1; doc_ad2 = q->doc; while (doc_ad2) { if (doc_ad1->doc == doc_ad2->doc) { /* if the document numbers are the same */ com = com +1; break; } doc_ad2 = doc_ad2->nextdoc; } doc_ad1 = doc_ad1->nextdoc; } /* now get # of documents indexed by term 2 */ doc_ad2 = q->doc; while (doc_ad2) { count2 = count2 + 1; doc_ad2 = doc_ad2->nextdoc; } *l1 = count1; *l2 = count2; *common = com; } /*************************************************************************** *find_term(term) Returns: address of a struct invert record Purpose: search for a specified term in the inverted file & return address of the record **/ struct invert *find_term(term) char term[MAXWORD]; /* in: term to be located in inverted file */ { struct invert *inv_addr; /* tracks addr. of current rec. in inv. file */ inv_addr = startinv; /* begin at top of inv. file */ while(inv_addr) { if (!strcmp(term,inv_addr->term)) {return(inv_addr);} inv_addr = inv_addr->next; } (void) fprintf(output,"Findterm routine: Term %s not found\n",term); return(NULL); } /*************************************************************************** *get_mem_direct() Returns: address of a struct direct record Purpose: dynamically obtain enough memory to store 1 direct record **/ static struct direct *get_mem_direct() { struct direct *record; record = (struct direct *)malloc(sizeof(directfile)); if (!record) { (void) fprintf(output,"\nout of memory\n"); exit(0); } return(record); } /*************************************************************************** *get_mem_termlist() Returns: address of a struct termlist record Purpose: dynamically obtain enough memory to store one termlist record **/ static struct termlist *get_mem_termlist() { struct termlist *record; record = (struct termlist *)malloc(sizeof(termlistfile)); if (!record) { (void) fprintf(output,"\nout of memory\n"); exit(0); } return(record); } /*************************************************************************** *get_mem_invert() Returns: address of a struct invert record Purpose: dynamically obtain enough memory to store one inverted file record **/ static struct invert *get_mem_invert() { struct invert *record; record = (struct invert *)malloc(sizeof(invfile)); if (!record) { (void) fprintf(output,"\nout of memory\n"); exit(0); } return(record); } /*************************************************************************** *get_mem_doclist() Returns: address of a struct doclist record Purpose: dynamically obtain enough memory to store one doclist record **/ static struct doclist *get_mem_doclist() { struct doclist *record; record = (struct doclist *)malloc(sizeof(doclistfile)); if (!record) { (void) (void) fprintf(output,"\nout of memory\n"); exit(0); } return(record); }